Loading important packages

library(readr,  warn.conflicts=F)
library(RColorBrewer,  warn.conflicts=F) #Rcolorbrewer palette
library(corrplot,  warn.conflicts=F)
## corrplot 0.84 loaded
library(ggcorrplot,  warn.conflicts=F)
## Loading required package: ggplot2
library(plotly,  warn.conflicts=F)
library(ggplot2, warn.conflicts=F)
library(reshape, warn.conflicts=F)
library(viridis, warn.conflicts=F)
## Loading required package: viridisLite
library(tidyverse, warn.conflicts=F)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ tibble  3.0.5     ✓ dplyr   1.0.5
## ✓ tidyr   1.1.2     ✓ stringr 1.4.0
## ✓ purrr   0.3.4     ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x tidyr::expand() masks reshape::expand()
## x dplyr::filter() masks plotly::filter(), stats::filter()
## x dplyr::lag()    masks stats::lag()
## x dplyr::rename() masks reshape::rename(), plotly::rename()
library(hrbrthemes, warn.conflicts=F)
## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.
##       Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
##       if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow
library(psych, warn.conflicts=F)
library(class, warn.conflicts=F)
library(caret, warn.conflicts = F)
## Loading required package: lattice
library(DescTools)
## 
## Attaching package: 'DescTools'
## The following objects are masked from 'package:caret':
## 
##     MAE, RMSE
## The following objects are masked from 'package:psych':
## 
##     AUC, ICC, SD
library(sjPlot)
## Install package "strengejacke" from GitHub (`devtools::install_github("strengejacke/strengejacke")`) to load all sj-packages at once!
library(kernlab)
## 
## Attaching package: 'kernlab'
## The following object is masked from 'package:psych':
## 
##     alpha
## The following object is masked from 'package:purrr':
## 
##     cross
## The following object is masked from 'package:ggplot2':
## 
##     alpha
library(caret)
library(Matrix)
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## The following object is masked from 'package:reshape':
## 
##     expand
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
# Fix the random-number seed so that any stochastic step later in the script
# gives the same result on every run.
set.seed(123456789)

The data was created by Dr. William H. Wolberg, W. Nick Street, and Olvi L. Mangasarian and then uploaded to Kaggle by Street in 1995. The data set describes characteristics of the cell nuclei of a breast mass. The nuclei are seen through digitized images from a fine needle aspirate (FNA) of a breast mass. FNA is a type of biopsy performed to gain samples of tissue and fluid from breast lesions using a twenty-one to twenty-five gauge needle. It is one of the ways doctors can diagnose breast cancer without removing the mass first.

# Data dictionary: the diagnosis label plus the ten nucleus features, each with
# a one-line description. Rendered below as a styled kableExtra table.
text_tbl <- data.frame(
  Variable = c(
    "Diagnosis", "Radius", "Texture", "Perimeter", "Area", "Smoothness",
    "Compactness", "Concavity", "Concave Points", "Symmetry",
    "Fractal Dimension"
  ),
  Description = c(
    "M for malignant or B for benign",
    "The mean of three measured distances from center to perimeter",
    "The standard deviation of gray-scale values",
    "The measure of the distance around the boundary of the nuclei",
    "The measure of the surface of the nuclei",
    "The variation in radius lengths",
    "The measure equal to the perimeter squared divided by the area all minus one",
    "The severity of concave portions on the contour",
    "The number of concave portions of the contour",
    "The measure of the likeness across any diameter of the nuclei",
    "The measure of the “coastline approximation” minus one"
  )
)

# TRUE/FALSE are spelled out because T and F are ordinary variables that can be
# reassigned. Column 1 is bold; column 2 is given a fixed width of 30em.
kbl(text_tbl, booktabs = TRUE) %>%
  kable_styling(full_width = FALSE) %>%
  column_spec(1, bold = TRUE) %>%
  column_spec(2, width = "30em")
Variable Description
Diagnosis M for malignant or B for benign
Radius The mean of three measured distances from center to perimeter
Texture The standard deviation of gray-scale values
Perimeter The measure of the distance around the boundary of the nuclei
Area The measure of the surface of the nuclei
Smoothness The variation in radius lengths
Compactness The measure equal to the perimeter squared divided by the area all minus one
Concavity The severity of concave portions on the contour
Concave Points The number of concave portions of the contour
Symmetry The measure of the likeness across any diameter of the nuclei
Fractural Dimension The measure of the “coastline approximation” minus one

Importing Dataset

# Load the raw CSV into a tibble. readr warns about an unnamed 33rd column
# (auto-named X33) and reports one parsing failure per row, because every data
# row has 32 fields while 33 columns are expected.
data <- read_csv(file = "~/Downloads/data.csv")
## Warning: Missing column names filled in: 'X33' [33]
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   .default = col_double(),
##   diagnosis = col_character(),
##   X33 = col_character()
## )
## ℹ Use `spec()` for the full column specifications.
## Warning: 569 parsing failures.
## row col   expected     actual                   file
##   1  -- 33 columns 32 columns '~/Downloads/data.csv'
##   2  -- 33 columns 32 columns '~/Downloads/data.csv'
##   3  -- 33 columns 32 columns '~/Downloads/data.csv'
##   4  -- 33 columns 32 columns '~/Downloads/data.csv'
##   5  -- 33 columns 32 columns '~/Downloads/data.csv'
## ... ... .......... .......... ......................
## See problems(...) for more details.
# Auto-print the imported tibble for a first look at its size and column types.
data
## # A tibble: 569 x 33
##        id diagnosis radius_mean texture_mean perimeter_mean area_mean
##     <dbl> <chr>           <dbl>        <dbl>          <dbl>     <dbl>
##  1 8.42e5 M                18.0         10.4          123.      1001 
##  2 8.43e5 M                20.6         17.8          133.      1326 
##  3 8.43e7 M                19.7         21.2          130       1203 
##  4 8.43e7 M                11.4         20.4           77.6      386.
##  5 8.44e7 M                20.3         14.3          135.      1297 
##  6 8.44e5 M                12.4         15.7           82.6      477.
##  7 8.44e5 M                18.2         20.0          120.      1040 
##  8 8.45e7 M                13.7         20.8           90.2      578.
##  9 8.45e5 M                13           21.8           87.5      520.
## 10 8.45e7 M                12.5         24.0           84.0      476.
## # … with 559 more rows, and 27 more variables: smoothness_mean <dbl>,
## #   compactness_mean <dbl>, concavity_mean <dbl>, `concave points_mean` <dbl>,
## #   symmetry_mean <dbl>, fractal_dimension_mean <dbl>, radius_se <dbl>,
## #   texture_se <dbl>, perimeter_se <dbl>, area_se <dbl>, smoothness_se <dbl>,
## #   compactness_se <dbl>, concavity_se <dbl>, `concave points_se` <dbl>,
## #   symmetry_se <dbl>, fractal_dimension_se <dbl>, radius_worst <dbl>,
## #   texture_worst <dbl>, perimeter_worst <dbl>, area_worst <dbl>,
## #   smoothness_worst <dbl>, compactness_worst <dbl>, concavity_worst <dbl>,
## #   `concave points_worst` <dbl>, symmetry_worst <dbl>,
## #   fractal_dimension_worst <dbl>, X33 <chr>

Looking at dataset

# Preview the first six rows of the imported data.
head(data, n = 6L)
## # A tibble: 6 x 33
##       id diagnosis radius_mean texture_mean perimeter_mean area_mean
##    <dbl> <chr>           <dbl>        <dbl>          <dbl>     <dbl>
## 1 8.42e5 M                18.0         10.4          123.      1001 
## 2 8.43e5 M                20.6         17.8          133.      1326 
## 3 8.43e7 M                19.7         21.2          130       1203 
## 4 8.43e7 M                11.4         20.4           77.6      386.
## 5 8.44e7 M                20.3         14.3          135.      1297 
## 6 8.44e5 M                12.4         15.7           82.6      477.
## # … with 27 more variables: smoothness_mean <dbl>, compactness_mean <dbl>,
## #   concavity_mean <dbl>, `concave points_mean` <dbl>, symmetry_mean <dbl>,
## #   fractal_dimension_mean <dbl>, radius_se <dbl>, texture_se <dbl>,
## #   perimeter_se <dbl>, area_se <dbl>, smoothness_se <dbl>,
## #   compactness_se <dbl>, concavity_se <dbl>, `concave points_se` <dbl>,
## #   symmetry_se <dbl>, fractal_dimension_se <dbl>, radius_worst <dbl>,
## #   texture_worst <dbl>, perimeter_worst <dbl>, area_worst <dbl>,
## #   smoothness_worst <dbl>, compactness_worst <dbl>, concavity_worst <dbl>,
## #   `concave points_worst` <dbl>, symmetry_worst <dbl>,
## #   fractal_dimension_worst <dbl>, X33 <chr>

Columns in dataset

# List every column name in the imported data.
names(data)
##  [1] "id"                      "diagnosis"              
##  [3] "radius_mean"             "texture_mean"           
##  [5] "perimeter_mean"          "area_mean"              
##  [7] "smoothness_mean"         "compactness_mean"       
##  [9] "concavity_mean"          "concave points_mean"    
## [11] "symmetry_mean"           "fractal_dimension_mean" 
## [13] "radius_se"               "texture_se"             
## [15] "perimeter_se"            "area_se"                
## [17] "smoothness_se"           "compactness_se"         
## [19] "concavity_se"            "concave points_se"      
## [21] "symmetry_se"             "fractal_dimension_se"   
## [23] "radius_worst"            "texture_worst"          
## [25] "perimeter_worst"         "area_worst"             
## [27] "smoothness_worst"        "compactness_worst"      
## [29] "concavity_worst"         "concave points_worst"   
## [31] "symmetry_worst"          "fractal_dimension_worst"
## [33] "X33"

Checking for missing values

# Alternative per-column count of missing values (kept for reference, not run):
## lapply(data,function(x) { length(which(is.na(x)))})
# skim() summary: of the 2 character columns, diagnosis has no missing values
# while X33 is missing in all 569 rows; the other 31 columns are numeric and
# have no missing values.
skimr::skim(data)
Data summary
Name data
Number of rows 569
Number of columns 33
_______________________
Column type frequency:
character 2
numeric 31
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
diagnosis 0 1 1 1 0 2 0
X33 569 0 NA NA 0 0 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
id 0 1 30371831.43 125020585.61 8670.00 869218.00 906024.00 8813129.00 911320502.00 ▇▁▁▁▁
radius_mean 0 1 14.13 3.52 6.98 11.70 13.37 15.78 28.11 ▂▇▃▁▁
texture_mean 0 1 19.29 4.30 9.71 16.17 18.84 21.80 39.28 ▃▇▃▁▁
perimeter_mean 0 1 91.97 24.30 43.79 75.17 86.24 104.10 188.50 ▃▇▃▁▁
area_mean 0 1 654.89 351.91 143.50 420.30 551.10 782.70 2501.00 ▇▃▂▁▁
smoothness_mean 0 1 0.10 0.01 0.05 0.09 0.10 0.11 0.16 ▁▇▇▁▁
compactness_mean 0 1 0.10 0.05 0.02 0.06 0.09 0.13 0.35 ▇▇▂▁▁
concavity_mean 0 1 0.09 0.08 0.00 0.03 0.06 0.13 0.43 ▇▃▂▁▁
concave points_mean 0 1 0.05 0.04 0.00 0.02 0.03 0.07 0.20 ▇▃▂▁▁
symmetry_mean 0 1 0.18 0.03 0.11 0.16 0.18 0.20 0.30 ▁▇▅▁▁
fractal_dimension_mean 0 1 0.06 0.01 0.05 0.06 0.06 0.07 0.10 ▆▇▂▁▁
radius_se 0 1 0.41 0.28 0.11 0.23 0.32 0.48 2.87 ▇▁▁▁▁
texture_se 0 1 1.22 0.55 0.36 0.83 1.11 1.47 4.88 ▇▅▁▁▁
perimeter_se 0 1 2.87 2.02 0.76 1.61 2.29 3.36 21.98 ▇▁▁▁▁
area_se 0 1 40.34 45.49 6.80 17.85 24.53 45.19 542.20 ▇▁▁▁▁
smoothness_se 0 1 0.01 0.00 0.00 0.01 0.01 0.01 0.03 ▇▃▁▁▁
compactness_se 0 1 0.03 0.02 0.00 0.01 0.02 0.03 0.14 ▇▃▁▁▁
concavity_se 0 1 0.03 0.03 0.00 0.02 0.03 0.04 0.40 ▇▁▁▁▁
concave points_se 0 1 0.01 0.01 0.00 0.01 0.01 0.01 0.05 ▇▇▁▁▁
symmetry_se 0 1 0.02 0.01 0.01 0.02 0.02 0.02 0.08 ▇▃▁▁▁
fractal_dimension_se 0 1 0.00 0.00 0.00 0.00 0.00 0.00 0.03 ▇▁▁▁▁
radius_worst 0 1 16.27 4.83 7.93 13.01 14.97 18.79 36.04 ▆▇▃▁▁
texture_worst 0 1 25.68 6.15 12.02 21.08 25.41 29.72 49.54 ▃▇▆▁▁
perimeter_worst 0 1 107.26 33.60 50.41 84.11 97.66 125.40 251.20 ▇▇▃▁▁
area_worst 0 1 880.58 569.36 185.20 515.30 686.50 1084.00 4254.00 ▇▂▁▁▁
smoothness_worst 0 1 0.13 0.02 0.07 0.12 0.13 0.15 0.22 ▂▇▇▂▁
compactness_worst 0 1 0.25 0.16 0.03 0.15 0.21 0.34 1.06 ▇▅▁▁▁
concavity_worst 0 1 0.27 0.21 0.00 0.11 0.23 0.38 1.25 ▇▅▂▁▁
concave points_worst 0 1 0.11 0.07 0.00 0.06 0.10 0.16 0.29 ▅▇▅▃▁
symmetry_worst 0 1 0.29 0.06 0.16 0.25 0.28 0.32 0.66 ▅▇▁▁▁
fractal_dimension_worst 0 1 0.08 0.02 0.06 0.07 0.08 0.09 0.21 ▇▃▁▁▁

Note that the measurements in the dataset fall into three categories: mean, se, and worst.

DATA WRANGLING: Deleting the X33 column, as it appears to be an artifact of importing the dataset

# X33 is the all-missing column created during import. Keep every column whose
# name does not appear in `drops`, then print the result.
drops <- "X33"
data <- data[, is.na(match(names(data), drops))]
data
## # A tibble: 569 x 32
##        id diagnosis radius_mean texture_mean perimeter_mean area_mean
##     <dbl> <chr>           <dbl>        <dbl>          <dbl>     <dbl>
##  1 8.42e5 M                18.0         10.4          123.      1001 
##  2 8.43e5 M                20.6         17.8          133.      1326 
##  3 8.43e7 M                19.7         21.2          130       1203 
##  4 8.43e7 M                11.4         20.4           77.6      386.
##  5 8.44e7 M                20.3         14.3          135.      1297 
##  6 8.44e5 M                12.4         15.7           82.6      477.
##  7 8.44e5 M                18.2         20.0          120.      1040 
##  8 8.45e7 M                13.7         20.8           90.2      578.
##  9 8.45e5 M                13           21.8           87.5      520.
## 10 8.45e7 M                12.5         24.0           84.0      476.
## # … with 559 more rows, and 26 more variables: smoothness_mean <dbl>,
## #   compactness_mean <dbl>, concavity_mean <dbl>, `concave points_mean` <dbl>,
## #   symmetry_mean <dbl>, fractal_dimension_mean <dbl>, radius_se <dbl>,
## #   texture_se <dbl>, perimeter_se <dbl>, area_se <dbl>, smoothness_se <dbl>,
## #   compactness_se <dbl>, concavity_se <dbl>, `concave points_se` <dbl>,
## #   symmetry_se <dbl>, fractal_dimension_se <dbl>, radius_worst <dbl>,
## #   texture_worst <dbl>, perimeter_worst <dbl>, area_worst <dbl>,
## #   smoothness_worst <dbl>, compactness_worst <dbl>, concavity_worst <dbl>,
## #   `concave points_worst` <dbl>, symmetry_worst <dbl>,
## #   fractal_dimension_worst <dbl>


Finally, we got rid of all the missing values, so the modified data is ready to use for further analysis.


Let’s look at the correlation matrix to see the correlations between all the variables

# Pearson correlation matrix over every numeric column. vapply() always returns
# a logical mask, whereas sapply() can change its return type on unusual input
# (for example a data frame with zero columns).
# NOTE(review): `id` is numeric, so it is included in this matrix even though it
# is an identifier rather than a measurement - confirm whether it should be
# excluded before interpreting the plots.
matrixData <- cor(
  data[vapply(data, is.numeric, logical(1))],
  method = "pearson"
)

# RColorBrewer "PiYG" palette (8 base colours) interpolated to 25 shades.
coul <- colorRampPalette(brewer.pal(8, "PiYG"))(25)
heatmap(matrixData, scale = "column", col = coul)

# Correlogram of the same matrix, ordered by hierarchical clustering (average
# linkage) with 4 cluster rectangles drawn.
corrplot(
  matrixData,
  tl.col = "black",
  order = "hclust",
  hclust.method = "average",
  addrect = 4,
  tl.cex = 0.7
)

# Pearson correlations within each measurement family, selected by column
# position: 3-12 are the *_mean columns, 13-22 the *_se columns and 23-32 the
# *_worst columns.
data.mean <- cor(data[, 3:12], method = "pearson")
data.se <- cor(data[, 13:22], method = "pearson")
data.worst <- cor(data[, 23:32], method = "pearson")

# One clustered correlogram per family, drawn in the order mean, se, worst.
invisible(lapply(
  list(data.mean, data.se, data.worst),
  corrplot,
  tl.col = "black",
  order = "hclust",
  hclust.method = "average",
  addrect = 4,
  tl.cex = 0.7
))


# Number of observations in each diagnosis class (B and M).
table(data$diagnosis)
## 
##   B   M 
## 357 212
# Rows per diagnosis class (`n`) and each class's share of all rows
# (`relative_freq`). Assigned with `<-` so the target name is visible at the
# start of the pipeline instead of being hidden at its end by `->`.
relative_freq <- count(data, diagnosis) %>%
  mutate(relative_freq = n / sum(n))
relative_freq
## # A tibble: 2 x 3
##   diagnosis     n relative_freq
##   <chr>     <int>         <dbl>
## 1 B           357         0.627
## 2 M           212         0.373
# Bar chart of the number of rows per diagnosis. Bars are filled by diagnosis
# using the "Set1" brewer palette, and the legend is hidden.
ggplot(
  data,
  aes(x = as.factor(diagnosis), fill = as.factor(diagnosis))
) +
  geom_bar() +
  scale_fill_brewer(palette = "Set1") +
  ggtitle("Barplot representing two different tumors") +
  theme(legend.position = "none")

# Scatter-plot matrices, one per measurement family, drawn in the order mean,
# se, worst. Each family is a block of ten adjacent columns.
invisible(Map(
  function(cols, heading) pairs.panels(data[, cols], main = heading),
  list(3:12, 13:22, 23:32),
  c("Cancer Mean", "Cancer SE", "Cancer Worst")
))

Now we will construct nine violin plots showing how the radius, perimeter, and area measurements (mean, se, and worst) are distributed for each diagnosis.

# Violin plot of one numeric column split by diagnosis, with a very narrow
# boxplot drawn on top of each violin.
#
# Arguments:
#   df            data frame containing `diagnosis` and the column named by
#                 `column`
#   column        name (string) of the column mapped to the y axis
#   title         plot title
#   outlier_size  point size used for the boxplot outliers
#
# Returns the ggplot object, which is auto-printed when the call is made at top
# level. The y-axis label is set to the column name explicitly so it reads the
# same as when the column is mapped directly by name.
violin_by_diagnosis <- function(df, column, title, outlier_size = 3) {
  ggplot(df, aes(x = diagnosis, y = .data[[column]])) +
    geom_violin(fill = "cornflowerblue") +
    geom_boxplot(
      width = .01,
      fill = "orange",
      outlier.color = "orange",
      outlier.size = outlier_size
    ) +
    labs(title = title, y = column)
}

# Radius: mean, se, worst. The first two plots use outlier size 2, as before.
violin_by_diagnosis(
  data, "radius_mean", "Radius Mean distribution by diagnosis",
  outlier_size = 2
)

violin_by_diagnosis(
  data, "radius_se", "Radius Se distribution by diagnosis",
  outlier_size = 2
)

violin_by_diagnosis(
  data, "radius_worst", "Radius Worst distribution by diagnosis"
)

# Area: mean, se, worst.
violin_by_diagnosis(data, "area_mean", "Area Mean distribution by diagnosis")

violin_by_diagnosis(data, "area_se", "Area_Se distribution by diagnosis")

violin_by_diagnosis(data, "area_worst", "Area Worst distribution by diagnosis")

# Perimeter: se, mean, worst (the original plotting order is kept).
violin_by_diagnosis(
  data, "perimeter_se", "Perimeter_se distribution by diagnosis"
)

violin_by_diagnosis(
  data, "perimeter_mean", "perimeter mean distribution by diagnosis"
)

violin_by_diagnosis(
  data, "perimeter_worst", "Perimeter Worst distribution by diagnosis"
)


Let’s split the data now to see how tumors differ for M and B

# Partition the rows by diagnosis: split() returns a list with one data frame
# per diagnosis value, from which the B and M subsets are extracted by name.
cancer_split <- split(data, f = data[["diagnosis"]])
dataB <- cancer_split[["B"]]
dataM <- cancer_split[["M"]]

# Bar chart of the row count per diagnosis, filled by diagnosis (default
# colours, legend shown).
ggplot(
  data,
  aes(x = as.factor(diagnosis), fill = as.factor(diagnosis))
) +
  geom_bar() +
  ggtitle("Barplot representing two different tumors")


Now we have two different datasets for B and M